In [1]:
import pandas as pd 
import numpy as np 
import plotly.express as px 
from matplotlib import pyplot as plt
import seaborn as sns

pd.options.display.float_format ='{:,.2f}'.format
pd.options.display.max_rows = None
pd.options.display.max_columns = None
In [2]:
def outlire(data):
    min = data.min()
    q1=  data.quantile(.25)
    q2=   data.median()
    q3= data.quantile(.75)
    iqr= q3-q1
    lb=q1-(1.5*iqr)
    ub=q3+(1.5*iqr)
    print("lb",lb)
    print("ub",ub)
In [3]:
df=pd.read_csv("madrid_cleaned_data.csv")
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21177 entries, 0 to 21176
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         21177 non-null  int64  
 1   id                 21177 non-null  int64  
 2   district           21177 non-null  object 
 3   sq_mt_built        21177 non-null  float64
 4   n_rooms            21177 non-null  int64  
 5   n_bathrooms        21177 non-null  int64  
 6   buy_price          21177 non-null  int64  
 7   buy_price_by_area  21177 non-null  int64  
dtypes: float64(1), int64(6), object(1)
memory usage: 1.3+ MB
In [4]:
df.head()
Out[4]:
Unnamed: 0 id district sq_mt_built n_rooms n_bathrooms buy_price buy_price_by_area
0 0 21742 San Cristóbal 64.00 2 1 85000 1328
1 1 21741 Los Ángeles 70.00 3 1 129900 1856
2 2 21740 San Andrés 94.00 2 2 144247 1535
3 3 21739 San Andrés 64.00 2 1 109900 1717
4 4 21738 Los Rosales 108.00 2 2 260000 2407
In [5]:
df.describe()
Out[5]:
Unnamed: 0 id sq_mt_built n_rooms n_bathrooms buy_price buy_price_by_area
count 21,177.00 21,177.00 21,177.00 21,177.00 21,177.00 21,177.00 21,177.00
mean 10,885.42 10,856.58 148.65 3.04 2.08 644,503.66 4,022.38
std 6,292.32 6,292.32 134.65 1.42 1.34 747,510.50 1,910.83
min 0.00 1.00 16.00 1.00 1.00 36,000.00 536.00
25% 5,419.00 5,413.00 71.00 2.00 1.00 199,000.00 2,550.00
50% 10,919.00 10,823.00 101.00 3.00 2.00 379,000.00 3,723.00
75% 16,329.00 16,323.00 165.00 4.00 2.00 765,000.00 5,000.00
max 21,741.00 21,742.00 999.00 24.00 14.00 8,800,000.00 18,889.00
In [6]:
# numerical columns
plt.figure(figsize=(20,8))
for e , i in enumerate(df.columns[3:]):  #e= index  , i =numercal columns
    plt.subplot(2,3  ,e+1)
    sns.histplot(df[i] ,kde=True)
    plt.title("distribution of " + i)
    plt.tight_layout();
    
    
C:\Users\dell\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\dell\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\dell\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\dell\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\dell\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
No description has been provided for this image
In [7]:
plt.figure(figsize=(20,8))
for e , i in enumerate(df.columns[3:]):  #e= index  , i =numercal columns
    plt.subplot(2,3  ,e+1)
    sns.boxplot(x=df[i] )
    plt.title("distribution of " + i)
    plt.tight_layout();
    
No description has been provided for this image
In [8]:
print("sq_mt")
outlire(data = df.sq_mt_built)
print("n_rooms")
outlire(data = df.n_rooms)
print("n_bathrooms")
outlire(data = df.n_bathrooms)
print("puy_price")
outlire(data = df.buy_price)
print("buy_price_by_area")
outlire(data = df.buy_price_by_area)
sq_mt
lb -70.0
ub 306.0
n_rooms
lb -1.0
ub 7.0
n_bathrooms
lb -0.5
ub 3.5
puy_price
lb -650000.0
ub 1614000.0
buy_price_by_area
lb -1125.0
ub 8675.0
In [ ]:
 
In [9]:
px.bar(df.district.value_counts() , title="Distribution of district")

correlation coeffcient¶

In [11]:
corr=df[["sq_mt_built","n_rooms","n_bathrooms"]]
sns.pairplot(corr ,kind="reg")
plt.tight_layout();
C:\Users\dell\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\dell\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\dell\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

No description has been provided for this image
In [12]:
sns.heatmap(corr.corr() ,annot=True)
plt.title("corrleation cofficient between numerical columns")
Out[12]:
Text(0.5, 1.0, 'corrleation cofficient between numerical columns')
No description has been provided for this image
In [13]:
corr=df[["buy_price","sq_mt_built","n_rooms","n_bathrooms"]]
sns.heatmap(corr.corr() ,annot=True)
plt.title("corrleation cofficient between numerical columns")
Out[13]:
Text(0.5, 1.0, 'corrleation cofficient between numerical columns')
No description has been provided for this image
In [14]:
sns.regplot(data=df ,x="sq_mt_built",y="buy_price")
plt.title("correlation between area (m2) and buy price")
plt.xlabel("area (m2)")
plt.ylabel("buy price");
No description has been provided for this image
In [15]:
df.head()
Out[15]:
Unnamed: 0 id district sq_mt_built n_rooms n_bathrooms buy_price buy_price_by_area
0 0 21742 San Cristóbal 64.00 2 1 85000 1328
1 1 21741 Los Ángeles 70.00 3 1 129900 1856
2 2 21740 San Andrés 94.00 2 2 144247 1535
3 3 21739 San Andrés 64.00 2 1 109900 1717
4 4 21738 Los Rosales 108.00 2 2 260000 2407

location or size:what influences house prices in madride¶

do housing prices vary by district if so ,which are the most expensive for purchasin a home ?

In [17]:
mean_price =df.groupby("district")["buy_price"].mean().round(2).sort_values(ascending=False)
mean_price.head(10).plot(kind="bar" ,xlabel ="District" ,ylabel="price",title="mean of puy_price")
Out[17]:
<Axes: title={'center': 'mean of puy_price'}, xlabel='District', ylabel='price'>
No description has been provided for this image
In [18]:
mean_price =df.groupby("district")["buy_price_by_area"].mean().round(2).sort_values(ascending=False)
mean_price.head(10).plot(kind="bar" ,xlabel ="District" ,ylabel="price",title="mean of buy_price_by_area")
Out[18]:
<Axes: title={'center': 'mean of buy_price_by_area'}, xlabel='District', ylabel='price'>
No description has been provided for this image
In [19]:
reco = df.query('district == "Recoletos"')
In [20]:
reco.sample(10)
Out[20]:
Unnamed: 0 id district sq_mt_built n_rooms n_bathrooms buy_price buy_price_by_area
18698 19224 2518 Recoletos 84.00 2 2 730000 8690
19539 20074 1668 Recoletos 247.00 4 2 1700000 6883
18747 19274 2468 Recoletos 219.00 5 2 1970000 8995
18774 19301 2441 Recoletos 176.00 3 3 1460000 8295
19221 19752 1990 Recoletos 232.00 3 2 1950000 8405
19408 19943 1799 Recoletos 221.00 4 4 1700000 7692
19192 19723 2019 Recoletos 241.00 4 4 2500000 10373
19068 19598 2144 Recoletos 220.00 4 6 2400000 10909
19160 19691 2051 Recoletos 247.00 2 3 2200000 8907
18917 19445 2297 Recoletos 404.00 5 6 4350000 10767
In [21]:
import statsmodels.api as sm
reco['intercept'] = 1
lm = sm.OLS(reco["buy_price"] , reco[["intercept",'sq_mt_built']])
result = lm.fit()
result.summary()
C:\Users\dell\AppData\Local\Temp\ipykernel_17720\337612646.py:2: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Out[21]:
OLS Regression Results
Dep. Variable: buy_price R-squared: 0.833
Model: OLS Adj. R-squared: 0.832
Method: Least Squares F-statistic: 555.1
Date: Thu, 09 Jan 2025 Prob (F-statistic): 5.30e-45
Time: 17:29:55 Log-Likelihood: -1654.1
No. Observations: 113 AIC: 3312.
Df Residuals: 111 BIC: 3318.
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
intercept -4.342e+04 1.07e+05 -0.406 0.686 -2.55e+05 1.69e+05
sq_mt_built 8975.6204 380.945 23.561 0.000 8220.753 9730.488
Omnibus: 11.498 Durbin-Watson: 2.039
Prob(Omnibus): 0.003 Jarque-Bera (JB): 16.048
Skew: 0.512 Prob(JB): 0.000328
Kurtosis: 4.536 Cond. No. 574.


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [22]:
# المعاملات من الجدول
b0 = -43420  # الثابت
b1 = 8975.62  # معامل sq_mt_built

# قيمة x
x = 300

# حساب السعر
price = b0 + b1 * x
print("السعر المتوقع:", price)

print("predicted price of flat in recoletos with are of {} m2 = {} euro".format(x,price))
السعر المتوقع: 2649266.0000000005
predicted price of flat in recoletos with are of 300 m2 = 2649266.0000000005 euro

MULTILINER Regression model¶

  • with quantitive data
In [24]:
reco.head()
Out[24]:
Unnamed: 0 id district sq_mt_built n_rooms n_bathrooms buy_price buy_price_by_area intercept
18576 19102 2640 Recoletos 114.00 4 1 750000 6579 1
18580 19106 2636 Recoletos 250.00 3 4 2990000 11960 1
18584 19110 2632 Recoletos 187.00 3 3 1450000 7754 1
18591 19117 2625 Recoletos 417.00 7 6 2500000 5995 1
18594 19120 2622 Recoletos 127.00 2 2 990000 7795 1
In [25]:
corr = reco[["buy_price","sq_mt_built","n_rooms","n_bathrooms"]]
sns.heatmap(corr.corr(),annot=True)
plt.title("corrleation confficient numerical colums");
No description has been provided for this image
In [26]:
lm = sm.OLS(reco["buy_price"] , reco[["intercept",'sq_mt_built' ,"n_rooms" , "n_bathrooms"]])
result = lm.fit()
result.summary()
Out[26]:
OLS Regression Results
Dep. Variable: buy_price R-squared: 0.871
Model: OLS Adj. R-squared: 0.867
Method: Least Squares F-statistic: 245.2
Date: Thu, 09 Jan 2025 Prob (F-statistic): 2.69e-48
Time: 17:29:56 Log-Likelihood: -1639.7
No. Observations: 113 AIC: 3287.
Df Residuals: 109 BIC: 3298.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
intercept 1.068e+05 1.29e+05 0.828 0.409 -1.49e+05 3.62e+05
sq_mt_built 9686.1051 622.560 15.559 0.000 8452.212 1.09e+04
n_rooms -2.292e+05 5.01e+04 -4.573 0.000 -3.29e+05 -1.3e+05
n_bathrooms 1.679e+05 4.41e+04 3.804 0.000 8.04e+04 2.55e+05
Omnibus: 13.677 Durbin-Watson: 2.217
Prob(Omnibus): 0.001 Jarque-Bera (JB): 23.442
Skew: 0.517 Prob(JB): 8.12e-06
Kurtosis: 4.977 Cond. No. 804.


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

الجدول الذي بالاعلي لا يحتوي علي نتائج دقيقه وهذا بسبب عدم دقه n_room¶

In [28]:
lm = sm.OLS(reco["buy_price"] , reco[["intercept",'sq_mt_built'  , "n_bathrooms"]])
result = lm.fit()
result.summary()
Out[28]:
OLS Regression Results
Dep. Variable: buy_price R-squared: 0.846
Model: OLS Adj. R-squared: 0.843
Method: Least Squares F-statistic: 302.6
Date: Thu, 09 Jan 2025 Prob (F-statistic): 1.92e-45
Time: 17:29:56 Log-Likelihood: -1649.6
No. Observations: 113 AIC: 3305.
Df Residuals: 110 BIC: 3313.
Df Model: 2
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
intercept -2.139e+05 1.18e+05 -1.819 0.072 -4.47e+05 1.91e+04
sq_mt_built 7863.2947 519.692 15.131 0.000 6833.387 8893.202
n_bathrooms 1.442e+05 4.76e+04 3.028 0.003 4.98e+04 2.39e+05
Omnibus: 23.707 Durbin-Watson: 2.165
Prob(Omnibus): 0.000 Jarque-Bera (JB): 66.942
Skew: 0.698 Prob(JB): 2.91e-15
Kurtosis: 6.503 Cond. No. 668.


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [29]:
area = 300
bath = 3
price = 121200 + 3895.9 * area + 196000 *bath
print("pradicted price of falt in Recoletos with area of {} m2 and {} bathrooms = {} Euro".format(area,bath,price))
pradicted price of falt in Recoletos with area of 300 m2 and 3 bathrooms = 1877970.0 Euro
In [30]:
lm = sm.OLS(reco.buy_price,reco[["intercept","n_rooms","n_bathrooms"]])
result =lm.fit()
result.summary()
Out[30]:
OLS Regression Results
Dep. Variable: buy_price R-squared: 0.584
Model: OLS Adj. R-squared: 0.577
Method: Least Squares F-statistic: 77.32
Date: Thu, 09 Jan 2025 Prob (F-statistic): 1.07e-21
Time: 17:29:56 Log-Likelihood: -1705.8
No. Observations: 113 AIC: 3418.
Df Residuals: 110 BIC: 3426.
Df Model: 2
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
intercept -3.288e+05 2.25e+05 -1.462 0.147 -7.74e+05 1.17e+05
n_rooms 2.701e+05 6.88e+04 3.927 0.000 1.34e+05 4.06e+05
n_bathrooms 4.866e+05 6.98e+04 6.968 0.000 3.48e+05 6.25e+05
Omnibus: 67.811 Durbin-Watson: 2.128
Prob(Omnibus): 0.000 Jarque-Bera (JB): 364.137
Skew: 1.987 Prob(JB): 8.49e-80
Kurtosis: 10.845 Cond. No. 14.4


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [31]:
room =300
bath =3
price = 208900 + 76030 * room +294200 *bath
print("pradicted price of falt in Recoletos with area of {} m2 and {} bathrooms = {} Euro".format(area,bath,price))
pradicted price of falt in Recoletos with area of 300 m2 and 3 bathrooms = 23900500 Euro

--DUMMY VARIABLES¶

In [33]:
df.head()
Out[33]:
Unnamed: 0 id district sq_mt_built n_rooms n_bathrooms buy_price buy_price_by_area
0 0 21742 San Cristóbal 64.00 2 1 85000 1328
1 1 21741 Los Ángeles 70.00 3 1 129900 1856
2 2 21740 San Andrés 94.00 2 2 144247 1535
3 3 21739 San Andrés 64.00 2 1 109900 1717
4 4 21738 Los Rosales 108.00 2 2 260000 2407
In [34]:
df.district.nunique()
Out[34]:
146
In [35]:
df_dummy = df.query(' district in ("Recoletos","Castellana","Trafalgar") ')
df_dummy.head()
Out[35]:
Unnamed: 0 id district sq_mt_built n_rooms n_bathrooms buy_price buy_price_by_area
10604 10937 10805 Trafalgar 68.00 2 1 510000 7500
10667 11000 10742 Trafalgar 103.00 2 2 980000 9515
10668 11001 10741 Trafalgar 103.00 2 2 960000 9320
10676 11010 10732 Trafalgar 95.00 4 1 239000 2516
10708 11042 10700 Trafalgar 67.00 2 2 550000 8209
In [36]:
pd.get_dummies(df_dummy.district).head()
Out[36]:
Castellana Recoletos Trafalgar
10604 False False True
10667 False False True
10668 False False True
10676 False False True
10708 False False True
In [37]:
df_dummy[["Cast","Reco","Traf"]]=pd.get_dummies(df_dummy["district"], dtype="int")
df_dummy.head()
C:\Users\dell\AppData\Local\Temp\ipykernel_17720\3200272759.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

C:\Users\dell\AppData\Local\Temp\ipykernel_17720\3200272759.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

C:\Users\dell\AppData\Local\Temp\ipykernel_17720\3200272759.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Out[37]:
Unnamed: 0 id district sq_mt_built n_rooms n_bathrooms buy_price buy_price_by_area Cast Reco Traf
10604 10937 10805 Trafalgar 68.00 2 1 510000 7500 0 0 1
10667 11000 10742 Trafalgar 103.00 2 2 980000 9515 0 0 1
10668 11001 10741 Trafalgar 103.00 2 2 960000 9320 0 0 1
10676 11010 10732 Trafalgar 95.00 4 1 239000 2516 0 0 1
10708 11042 10700 Trafalgar 67.00 2 2 550000 8209 0 0 1
In [38]:
df_dummy ["intercept"] =1
lm = sm.OLS(df_dummy.buy_price, df_dummy[["intercept","Cast","Traf"]])
result =lm.fit()
result.summary()
C:\Users\dell\AppData\Local\Temp\ipykernel_17720\3764759753.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Out[38]:
OLS Regression Results
Dep. Variable: buy_price R-squared: 0.363
Model: OLS Adj. R-squared: 0.360
Method: Least Squares F-statistic: 114.3
Date: Thu, 09 Jan 2025 Prob (F-statistic): 5.20e-40
Time: 17:29:56 Log-Likelihood: -6095.1
No. Observations: 404 AIC: 1.220e+04
Df Residuals: 401 BIC: 1.221e+04
Df Model: 2
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
intercept 2.155e+06 8.15e+04 26.448 0.000 1.99e+06 2.32e+06
Cast -7.785e+05 1.27e+05 -6.129 0.000 -1.03e+06 -5.29e+05
Traf -1.513e+06 1.01e+05 -14.998 0.000 -1.71e+06 -1.31e+06
Omnibus: 158.066 Durbin-Watson: 2.012
Prob(Omnibus): 0.000 Jarque-Bera (JB): 761.523
Skew: 1.636 Prob(JB): 4.34e-166
Kurtosis: 8.876 Cond. No. 4.29


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [ ]: